* Start with no data in memory
clear
* Do not pause output at more prompts
set more off
* Request 1g of memory for data (set mem is obsolete and ignored from Stata 12 on)
set mem 1g
* From here on every command and every star comment ends at a semicolon, not at the end of the line
# delimit ;
/*****************************************************************************************************************;
**This dofile starts from collapsedGPS2004 and generates HouseholdArsenic_2004.dta (in dtafiles);
Other datasets this dofile also uses:
hh_simulatedmindistance_2004_grid (pre-generated, in dtafiles)
the raw 2004 DHS household file rawdta/DHS2004/BDHR4JDT/BDHR4JFL.DTA
*****************************************************************************************************************/;

*****************************************************************************************************************;
*Paths;
*****************************************************************************************************************;

*access files in subdirectory generated by data creating dofiles;
*indataset is the master dataset opened before the merges below (merged 1:1 on dhsid);
local indataset "dtafiles/collapsedGPS2004";
*outdataset is the file written by saveold at the end of this dofile;
local outdataset "dtafiles/HouseholdArsenic_2004.dta";

*****************************************************************************************************************;
*Main;
*****************************************************************************************************************;

*Household survey extract: read only the variables needed from the raw 2004 DHS file;
tempfile survey;
use hv001 hv002 hv003 hv005 sh21 sh23 sh25 sh27 sh44a shdistr
	using "rawdta/DHS2004/BDHR4JDT/BDHR4JFL.DTA";

*Rename the id variables (dhsid04 is the key of the 1:m merge further down);
rename hv002 v002;
rename hv001 dhsid04;

save `survey', replace;

*Simulated minimum-distance measures for 2004, prepared as a tempfile for the 1:1 merge on dhsid;
use "dtafiles/hh_simulatedmindistance_2004_grid.dta";

*Add the suffix _G to every variable from mean_mindistanceU through wfraction_mindistanceC_1m;
*foreach replaces the out-of-date for command and performs exactly the same renames;
foreach v of varlist mean_mindistanceU-wfraction_mindistanceC_1m {;
	rename `v' `v'_G;
};

sort dhsid;
gen year=2004;
tempfile sim2004;
save `sim2004', replace;

*Open the input dataset and attach the simulated distance measures, one row per dhsid on both sides;
use "`indataset'";
merge 1:1 dhsid using `sim2004', nogenerate;

*Attach the household survey records on dhsid04;
*survey households with no matching dhsid04 in the master data are not kept;
merge 1:m dhsid04 using `survey', keep(master match) nogenerate;

*Give the DHS water and arsenic items descriptive names, labelling each one right after its rename;
rename sh21 dishwashingsource;
label variable dishwashingsource "water source used for dishwashing";

rename sh23 sourcetestedwater;
label variable sourcetestedwater "type of water source household says uses for drinking";

rename sh25 heardofarsenic;
label variable heardofarsenic "whether heard of arsenic contamination problem";

rename sh27 wellmarking;
label variable wellmarking "marking on household's tubewell";

rename sh44a arsenictest;
label variable arsenictest "test of hh's water source in 2004 DHS";

*DHS reports the arsenic test result as a coded range (codes 1 to 13 are handled here);
*numarsenictest holds one value per code, described in the original note as the mean of the range;
*the list below gives the value for code 1, code 2, and so on in order;
*any other code, or a missing code, leaves numarsenictest missing;
gen numarsenictest=.;
local rangevalues "0 10 17.5 25 37.5 50 75 100 175 200 375 1000 2750";
local code=0;
foreach value of local rangevalues {;
	local code=`code'+1;
	replace numarsenictest=`value' if arsenictest==`code';
};
drop arsenictest;

label variable numarsenictest "mean of range of 2004 DHS household arsenic test";

** more contamination var;
*contaminatedwell is 1 when the test value is at least 50, 0 when it is below 50, and missing when there is no test value;
*Stata treats missing as larger than any number, so without the guard every untested household would be coded 1;
gen contaminatedwell=numarsenictest>=50 if !missing(numarsenictest);
*paintedwell is 1 when wellmarking is 1 or 2, and is left missing when wellmarking is 8;
*NOTE(review): a missing wellmarking is coded 0 here, confirm that this is intended;
gen paintedwell=inlist(wellmarking,1,2) if wellmarking!=8;
*redwell is defined for painted wells only and is 1 when wellmarking is 1;
gen redwell=wellmarking==1 if paintedwell==1;

*Exclude districts 3, 46 and 84. Per the original note, BGS did no testing there and
*it was confirmed that no cluster in these districts has any wells within 5 miles;

drop if shdistr==3 | shdistr==46 | shdistr==84;

*Indicators for source codes 31 or 32 as the drinking source (D) and the dishwashing source (DW);
*each indicator is left missing when its source variable is missing;
gen surfacewaterD=(sourcetestedwater==31 | sourcetestedwater==32) if sourcetestedwater!=.;
gen surfacewaterDW=(dishwashingsource==31 | dishwashingsource==32) if dishwashingsource!=.;
*1 when the drinking source is surface water or the well is contaminated, defined only when both inputs are nonmissing;
gen contaminatedorsurface=(surfacewaterD==1 | contaminatedwell==1) if surfacewaterD!=. & contaminatedwell!=.;

*Put the identifiers first, then the analysis variables grouped by theme;
*each variable is listed exactly once (contaminatedorsurface stays in its leading position);
*group notes are block comments, which are unambiguous inside a multi-line command under delimit semicolon;
order v002 hv003 hv005 dhsid dhsid04 shdistr year
contaminatedorsurface
/* contamination */
wfraction_mindistanceC_1m_G fraction_mindistanceC_1m_G fractioncont5 numcontaminated5 as_walkable5_g numwalkable
/* household water source */
surfacewaterD surfacewaterDW dishwashingsource sourcetestedwater
/* household arsenic */
heardofarsenic numarsenictest contaminatedwell redwell paintedwell wellmarking
/* clean access */
fraction_mindistanceU_1m_G wfraction_mindistanceU_1m_G mean_mindistanceU_G wmean_mindistanceU_G
;

*Write the result to the output path in an older Stata dataset format, overwriting any existing file;
saveold "`outdataset'", replace;

